# Who Do You Trust? - Daniel Goldstein and Johannes Wiedemann
# R-script for MRP-related Data Preparation


#====================================================================================================
# load packages

library(tidyverse)
library(readxl)
library(lfe)
library(interactions)
library(estimatr)
library(openxlsx)
library(texreg)
library(dotwhisker)
library(stargazer)
library(cobalt)
library(stringi)
library("arm")
library("foreign")


#=======================================================================

# Change the working directory as appropriate
setwd("~/../GoldsteinWiedemann_supplementary/Data and Data Preparation")



#=====================================================================================================
#=====================================================================================================

# Data Preparation for Multi-Level Regression and Poststratification (MRP)

# Note: this code follows closely the one provided by Kastellec et al. (2019): https://scholar.princeton.edu/sites/default/files/jkastellec/files/mrp_primer_replication_files.zip


#=====================================================================================================
# Read in data

#read in survey data (from ANES)
library(haven)
anes_timeseries_2016 <- read_dta("Raw Data/anes_timeseries_2016_Stata13.dta")

# outcome variables of interest: V161218 ("How many people running government are corrupt?"): will be used for MRP for Trust in Government, 
# and V161219 ("Generally speaking, how often can you trust other people?"): will be used for MRP for Social Trust



`%notin%` <- Negate(`%in%`)

# prepare the survey data so they can be used for analysis (e.g. drop missing and 'don't want to answer' answers)
anes_timeseries_2016 <- anes_timeseries_2016 %>% 
  mutate(State=as.character(V163001b)) %>% 
  mutate(State_initnum=ifelse(State=="CA",5,ifelse(State=="VA",46,ifelse(State=="NV",34,ifelse(State=="MD",21,ifelse(State=="NY",35,ifelse(State=="SC",41,ifelse(State=="NJ",32,ifelse(State=="NC",28,ifelse(State=="AZ",4,ifelse(State=="CT",7,ifelse(State=="OH",36,ifelse(State=="OR",38,ifelse(State=="KY",18,ifelse(State=="TN",43,ifelse(State=="DC",8,ifelse(State=="WI",49,ifelse(State=="TX",44,ifelse(State=="AL",2,ifelse(State=="IL",15,ifelse(State=="MN",24,ifelse(State=="MA",20,ifelse(State=="FL",10,ifelse(State=="VT",47,ifelse(State=="CO",6,ifelse(State=="MI",23,ifelse(State=="ND",29,ifelse(State=="WA",48,ifelse(State=="NE",30,ifelse(State=="LA",19,ifelse(State=="NH",31,ifelse(State=="MO",25,ifelse(State=="UT",45,ifelse(State=="MT",27,ifelse(State=="ID",14,ifelse(State=="GA",11,ifelse(State=="OK",37,ifelse(State=="AR",3,ifelse(State=="WY",51,ifelse(State=="WV",50,ifelse(State=="KS",17,ifelse(State=="PA",39,ifelse(State=="IN",16,ifelse(State=="NM",33,ifelse(State=="ME",22,ifelse(State=="RI",40,ifelse(State=="IA",13,ifelse(State=="MS",26,ifelse(State=="SD",42,ifelse(State=="AK",1,NA)))))))))))))))))))))))))))))))))))))))))))))))))) %>%
  mutate(State_initnum=ifelse(State=="HI",12,ifelse(State=="DE",9,State_initnum)))%>%
  filter(as.numeric(V165745) %notin% c(-2,-5,-7)) %>% mutate(Hisp=ifelse(as.numeric(V165745)==1,1,0))%>%
  filter(as.numeric(V165746) %notin% c(-2,-5)) %>% mutate(White=ifelse(as.numeric(V165746)==1,1,0))%>%
  filter(as.numeric(V165747) %notin% c(-2,-5)) %>% mutate(Black=ifelse(as.numeric(V165747)==1,1,0))%>%
  filter(as.numeric(V161342) %notin% c(3,-8,-9)) %>% mutate(Female=ifelse(as.numeric(V161342)==2,1,0))%>%
  filter(as.numeric(V161270) %notin% c(-8,-9,90,95)) %>% mutate(Educ=ifelse(as.numeric(V161270)<9,1,ifelse(as.numeric(V161270)<13,2,ifelse(as.numeric(V161270)==13,3,4))))%>%
  filter(as.numeric(V161267) %notin% c(-8,-9,90)) %>% mutate(Age=ifelse(as.numeric(V161267)<34,1,ifelse(as.numeric(V161267)<50,2,ifelse(as.numeric(V161267)==64,3,4))))%>%
  mutate(Race=ifelse(White==1,1,ifelse(Black==1,2,3)))%>%
  filter(as.numeric(V161215) %notin% c(-8,-9)) %>% mutate(TrustWash=ifelse(as.numeric(V161215)<4,1,0))%>%
  filter(as.numeric(V161219) %notin% c(-8,-9)) %>% mutate(TrustOthers=ifelse(as.numeric(V161219)<4,1,0))%>%
  filter(as.numeric(V161218) %notin% c(-8,-9)) %>% mutate(NoCorruption=ifelse(as.numeric(V161218)>2,1,0))%>%
  mutate(Region=ifelse(State=="DC","dc",ifelse(as.character(V163003)=="3","south",ifelse(as.character(V163003)=="4","west",ifelse(as.character(V163003)=="1","northeast",ifelse(as.character(V163003)=="2","midwest",NA))))))



# read in state-level dataset (taken from Kastellec et al. (2019): https://scholar.princeton.edu/sites/default/files/jkastellec/files/mrp_primer_replication_files.zip)

Statelevel <- read.dta("Raw Data/state_level_update.dta",convert.underscore = TRUE)
Statelevel <- Statelevel[order(Statelevel$sstate.initnum),]

# read in Census data (taken from Kastellec et al. (2019))
Census <- read.dta("Raw Data/poststratification 2000.dta",convert.underscore = TRUE)
Census <- Census[order(Census$cstate),]
Census$cstate.initnum <-  match(Census$cstate, Statelevel$sstate)



#==========================================================================================
#==========================================================================================
# Create index variables

# For Survey-Level Data

anes_timeseries_2016$race.female <- (anes_timeseries_2016$Female *3) + anes_timeseries_2016$Race
anes_timeseries_2016$age.edu.cat <- 4 * (anes_timeseries_2016$Age -1) + anes_timeseries_2016$Educ
anes_timeseries_2016$p.democrat.full <- Statelevel$pdemocrat[anes_timeseries_2016$State_initnum]# 
anes_timeseries_2016$p.ideology.full <-Statelevel$ideology[anes_timeseries_2016$State_initnum]# 
anes_timeseries_2016$p.kerry.full <- Statelevel$kerry.04[anes_timeseries_2016$State_initnum]

#==========================================================================================


# For Census-Level Data

Census$crace.female <- (Census$cfemale *3) + Census$crace.WBH 
Census$cage.edu.cat <- 4 * (Census$cage.cat -1) + Census$cedu.cat 
Census$cp.democrat.full<-  Statelevel$pdemocrat[Census$cstate.initnum]
Census$cp.ideology.full <- Statelevel$ideology[Census$cstate.initnum]
Census$cp.kerry.full <-  Statelevel$kerry.04[Census$cstate.initnum]



#==========================================================================================
#==========================================================================================

# Run individual-level opinion model for ANES question on Trust in Others ("Generally speaking, how often can you trust other people?")

# We predict Trust in Others by sex, age, state, partisanship, ideology, and Democratic vote share in 2004


individual.model <- glmer(formula = TrustOthers ~ (1|race.female) 
                          + (1|age.edu.cat) + (1|State)  + p.democrat.full +p.ideology.full
                          + p.kerry.full,data=anes_timeseries_2016, family=binomial(link="logit"))



# Create vector of state ranefs and then fill in missing ones

state.ranefs <- array(NA,c(51,1))
dimnames(state.ranefs) <-  list(c(Statelevel$sstate),"effect")
for(i in Statelevel$sstate){
  state.ranefs[i,1] <- ranef(individual.model)$State[i,1]
}
state.ranefs[,1][is.na(state.ranefs[,1])] <- 0 #set states with missing REs (b/c not in data) to zero


# Create a prediction for each cell in Census data for the Social Trust Variable
cellpred <- invlogit( fixef(individual.model)["(Intercept)"]
                      + ranef(individual.model)$race.female[Census$crace.female,1]
                      + ranef(individual.model)$age.edu.cat[Census$cage.edu.cat,1]
                      + state.ranefs[Census$cstate,1]
                      + (fixef(individual.model)["p.democrat.full"] *Census$cp.democrat.full)
                      + (fixef(individual.model)["p.ideology.full"] *Census$cp.ideology.full)
                      + (fixef(individual.model)["p.kerry.full"] *Census$cp.kerry.full)
)

# Weight the prediction by the frequency of cell                                       
cellpredweighted <- cellpred * Census$cpercent.state

# Calculate the percent within each state (weighted average of responses)
statepred <- 100* as.vector(tapply(cellpredweighted,Census$cstate,sum))

trust_vector2 <- as.data.frame(cbind(Statelevel$sstate,statepred))

trust_vector2 <- rename(trust_vector2,TrustOthers=statepred)



#==========================================================================================

# Run individual-level opinion model for ANES question on Trust in Government ("How many people running government are corrupt?")

# We predict Trust in Government (as measured by the above ANES question) by sex, age, state, partisanship, ideology, and Democratic vote share in 2004


individual.model <- glmer(formula = NoCorruption ~ (1|race.female) 
                          + (1|age.edu.cat) + (1|State)  + p.democrat.full +p.ideology.full
                          + p.kerry.full,data=anes_timeseries_2016, family=binomial(link="logit"))

#create vector of state ranefs and then fill in missing ones
state.ranefs <- array(NA,c(51,1))
dimnames(state.ranefs) <-  list(c(Statelevel$sstate),"effect")
for(i in Statelevel$sstate){
  state.ranefs[i,1] <- ranef(individual.model)$State[i,1]
}
state.ranefs[,1][is.na(state.ranefs[,1])] <- 0 #set states with missing REs (b/c not in data) to zero


#create a prediction for each cell in Census data for the Political Trust Variable
cellpred <- invlogit( fixef(individual.model)["(Intercept)"]
                      + ranef(individual.model)$race.female[Census$crace.female,1]
                      + ranef(individual.model)$age.edu.cat[Census$cage.edu.cat,1]
                      + state.ranefs[Census$cstate,1]
                      + (fixef(individual.model)["p.democrat.full"] *Census$cp.democrat.full)
                      + (fixef(individual.model)["p.ideology.full"] *Census$cp.ideology.full)
                      + (fixef(individual.model)["p.kerry.full"] *Census$cp.kerry.full)
)

#weights the prediction by the freq of cell                                       
cellpredweighted <- cellpred * Census$cpercent.state

#calculates the percent within each state (weighted average of responses)
statepred <- 100* as.vector(tapply(cellpredweighted,Census$cstate,sum))

trust_vector3 <- as.data.frame(cbind(Statelevel$sstate,statepred))

trust_vector3 <- rename(trust_vector3,NoCorruption=statepred)



#==========================================================================================


# Detach the previously used packages

detach("package:arm", unload=TRUE)
detach("package:MASS", unload=TRUE)
library(dplyr)
library(tidyverse)


#==========================================================================================


# Read in data used for MRP: covariates on the state level (assemble state-level covariates from The Atlas of Rural and Small Town America)



data("state")

statenames_new <- c(state.name,"District of Columbia")

# State-Level Income Covariates
ruralAtlas1 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 6)
ruralAtlas1 <- rename(ruralAtlas1,State_abb=State)
ruralAtlas1 <- ruralAtlas1 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))
ruralAtlas1 <- ruralAtlas1 %>% filter(County %in% statenames_new)
ruralAtlas1 <- ruralAtlas1 %>% filter(stri_sub(as.character(FIPS),3,5)=="000")

# State-Level Population Covariates
ruralAtlas2 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 3)
ruralAtlas2 <- rename(ruralAtlas2,State_abb=State)
ruralAtlas2 <- ruralAtlas2 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))
ruralAtlas2 <- ruralAtlas2 %>% filter(County %in% statenames_new)
ruralAtlas2 <- ruralAtlas2 %>% filter(stri_sub(as.character(FIPS),3,5)=="000")


# State-Level Employment Covariates
ruralAtlas4 <- read.xlsx("Raw Data/Copy of RuralAtlasData20.xlsx",sheet = 4)
ruralAtlas4 <- rename(ruralAtlas4,State_abb=State)
ruralAtlas4 <- ruralAtlas4 %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))
ruralAtlas4 <- ruralAtlas4 %>% filter(County %in% statenames_new)
ruralAtlas4 <- ruralAtlas4 %>% filter(stri_sub(as.character(FIPS),3,5)=="000")


# State-Level Covariates on e.g. marriage rates, organization memberships, religious association memberships

socCapital_alt_3 <- read.xlsx("Raw Data/Copy of social-capital-project-social-capital-index-data.xlsx",sheet = 7)
socCapital_alt_3 <- socCapital_alt_3[,-c(1:4,7)] 
socCapital_alt_3 <- socCapital_alt_3 %>% mutate(County_Name=paste0(County," County"))
socCapital_alt_3 <- rename(socCapital_alt_3,State_abb=State.Abbreviation)
socCapital_alt_3 <- socCapital_alt_3[,-2]


# State-Level Social Capital Index (by JEC), as well as its individual components

socCapital_state <- read.xlsx("Raw Data/Copy of social-capital-project-social-capital-index-data.xlsx",sheet = 2)
socCapital_state <- socCapital_state[,c(3:5)] 
socCapital_state <- socCapital_state[-1,] 
socCapital_state <- rename(socCapital_state,StatenameSocCap=X3,StateAbb=X4,StateSocCap=Index)


# add variable that denotes whether governor in given state is Republican (handcoded from https://www.nga.org/)

governor_2020_Rep <- cbind.data.frame(as.vector(setdiff(unique(socCapital_alt_3$State_abb),"DC")),c(1,1,1,1,0,0,0,0,1,1,0,1,0,1,1,0,0,0,0,1,1,0,0,1,1,0,1,0,1,0,0,0,0,1,1,1,0,0,0,1,1,1,1,1,1,0,0,1,0,1))
governor_2020_Rep <- rename(governor_2020_Rep,State_abb=`as.vector(setdiff(unique(socCapital_alt_3$State_abb), "DC"))`,Republican_gov_2020=`c(1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, `)


# add variable that denotes the implementation date of stay-at-home orders by state (taken from https://www.cnn.com/2020/03/23/us/coronavirus-which-states-stay-at-home-order-trnd/index.html)

stayhomeorders <- cbind.data.frame(as.vector(setdiff(unique(socCapital_alt_3$State_abb),"DC")),c("20/04/05","20/03/28","20/03/31",NA,"20/03/19","20/03/26","20/03/23","20/03/24","20/04/03","20/04/03","20/03/23","20/03/25","20/03/21","20/03/24",NA,"20/03/30","20/03/26","20/03/23","20/04/02","20/03/30","20/03/24","20/03/24","20/03/27","20/04/03","20/04/06","20/03/28",NA,"20/04/01","20/03/27","20/03/21","20/03/24","20/03/22","20/03/30",NA,"20/03/23","20/04/01","20/03/23","20/04/01","20/03/28","20/04/07",NA,"20/04/02","20/04/02",NA,"20/03/30","20/03/25","20/03/23","20/03/24","20/03/25",NA))
stayhomeorders <- rename(stayhomeorders,State_abb=`as.vector(setdiff(unique(socCapital_alt_3$State_abb), "DC"))`,StayOrder_Date=`c("20/04/05", "20/03/28", "20/03/31", NA, "20/03/19", "20/03/26", `)
stayhomeorders <- stayhomeorders %>% mutate(Stayorder_Date_date=as.Date(StayOrder_Date,"%y/%m/%d"))



# merge the read-in data

data1 <- left_join(ruralAtlas1,ruralAtlas2,by="State_abb")
data1 <- left_join(data1,ruralAtlas2,by="State_abb")
data1 <- left_join(data1,ruralAtlas4,by="State_abb")
data1 <- left_join(data1,socCapital_state,by=c("State_abb"="StateAbb"))



# add election data on the state level (taken from MIT Election Lab: https://electionlab.mit.edu/data)

elec_2016alt <- read.csv("Raw Data/countypres_2000-2016.csv")

elec_2016alt <- elec_2016alt %>% filter(year%in% c(2008,2012,2016)) %>% filter(party %in% c("democrat","republican"))
elec_2016alt <- elec_2016alt[,-7]

elec_2016alt <- elec_2016alt %>% spread(party,candidatevotes)

elec_2016alt <- elec_2016alt[,-c(4,6,8)]

elec_2016alt <- elec_2016alt %>% filter(year==2016) %>% group_by(state)%>% mutate(totalvotes_total=sum(totalvotes,na.rm=TRUE),republican_total=sum(republican,na.rm=TRUE)) %>% mutate(rep_per=republican_total/totalvotes_total)%>%mutate(rep_win=ifelse(rep_per>0.5,1,0))

elec_2016alt <- elec_2016alt %>% distinct(state,.keep_all = TRUE) 
elec_2016alt <- elec_2016alt %>%select(state,rep_per,rep_win)

# merge governor and elections data to the MRP dataset  
data1 <- left_join(data1,elec_2016alt,by=c("County.x"="state"))
data1 <- left_join(data1,governor_2020_Rep,by="State_abb")


#==========================================================================================



# The dataset compiled up to this point only draws on publicly available data.
# Researchers interested in replicating our final dataset for the MRP can merge this dataset with the Cuebiq data, once they obtained access (see read.me).

save(data1,file="GW_2020_MRPDataSet_withoutCuebiq_122220.RData") 

#load(file="GW_2020_MRPDataSet_withoutCuebiq_122220.RData") 



#===================================================================================================




# Read in Cuebiq mobility data (version from April 29, 2020 was used for the paper); see read.me on how to obtain access to the (proprietary) Cuebiq data

cuebiq_april_10 <- read.csv("Raw Data/cmi-20200429.csv")
cuebiq_april_10 <- cuebiq_april_10 %>% filter(as.Date(ref_dt)>="2020-01-01") %>% filter(as.Date(ref_dt)<"2020-04-20")


# Need county-level population data in order to aggregate Cuebiq mobility data from county to state level: each county will be weighted by relative population share

ruralAtlas2_new <- read.xlsx("County Covariates/Copy of RuralAtlasData20.xlsx",sheet = 3)
ruralAtlas2_new <- rename(ruralAtlas2_new,State_abb=State)
ruralAtlas2_new <- ruralAtlas2_new %>% mutate(County_Name=paste0(County," County"),FIPS_num=as.numeric(FIPS))

cuebiq_april_10 <- left_join(cuebiq_april_10,ruralAtlas2_new,by=c("county_fips_code"="FIPS_num"))



# Need state-level population data in order to aggregate Cuebiq mobility data from county to state level: each county will be weighted by relative population share

PopVector <- ruralAtlas2 %>% select(State_abb,County,TotalPopEst2018)
PopVector <- rename(PopVector,StatePOPULATION19=TotalPopEst2018)

cuebiq_april_10 <- left_join(cuebiq_april_10,PopVector,by="State_abb")


# create variables necessary for MRP analysis on the state level: CMI, Shelter percentage

mobility1 <- cuebiq_april_10   %>% group_by(state_name) %>% mutate(StatePOPULATION19=sum(TotalPopEst2018)/length(unique(week_name))) %>%
  mutate(cmi_new=(TotalPopEst2018/StatePOPULATION19)*cmi,shelter_new=(TotalPopEst2018/StatePOPULATION19)*sheltered_in_place) %>% group_by(state_name,week_name) %>% mutate(cmi_new_weekly=sum(cmi_new,na.rm=TRUE),shelter_new_weekly=sum(shelter_new,na.rm=TRUE)) %>% select(state_name,week_name,cmi_new_weekly,shelter_new_weekly,ref_dt,cmi_new,shelter_new) %>%
  group_by(state_name,week_name)%>%arrange(ref_dt)%>%slice(1)


# Finally, merge mobility data with remaining MRP data; merge MRP-trust variables, stay at home order dates data
data1 <- left_join(mobility1,data1,by=c("state_name"="County.x"))
data1 <- left_join(data1,trust_vector2,by=c("State_abb"="V1"))


MRP_time_series_1 <-  left_join(data1,stayhomeorders,by="State_abb")


MRP_weekly_series_1 <- MRP_time_series_1 %>% mutate(post_treatment3=if_else((as.Date(ref_dt)-Stayorder_Date_date)>(-4),1,0,missing = 0))
MRP_weekly_series_1 <- MRP_weekly_series_1 %>% mutate(week_num=as.numeric(stri_sub(week_name,7,8)))


MRP_weekly_series_1 <- MRP_weekly_series_1 %>% mutate(TrustOthers=as.numeric(as.character(TrustOthers))/100)
MRP_weekly_series_1 <- left_join(MRP_weekly_series_1,trust_vector3,by=c("State_abb"="V1"))
MRP_weekly_series_1 <- MRP_weekly_series_1 %>% mutate(NoCorruption_num=as.numeric(as.character(NoCorruption))/100)



save(MRP_weekly_series_1,file="GW_2020_MRPDataSet_122220.RData")


# @knitr EndPrepareDataMRP

#==========================================================================================
#==========================================================================================